Packages

library(nycflights13)
library(mosaicData)
library(janitor)
library(tidyr)
library(broom)
library(mdsr)
library(RColorBrewer)
library(osmdata)
library(ggmap)
library(zipcodeR)
library(tidyverse)
library(paletteer)
library(gt)

1. Use the nycflights13 package to answer the following questions:

(a) (Excerpted from Problem 4.9 in textbook) Use the flights table. What month had the highest proportion of cancelled flights (flights with missing departure/arrival delay time)? What month had the lowest? Interpret any seasonal patterns.

Here I defined a cancelled flights as a flight that never left, didn’t have any air time and that never arrived to its destination.

# Find data with na's
cancelled_flights <-
    flights %>% 
    filter(is.na(flights$dep_time) & is.na(flights$arr_time) & 
               is.na(flights$air_time)) %>% 
    select(month,day,arr_time,air_time,dep_time)
#Find flights cancelled
number_flights_cancelled <- 
  cancelled_flights %>% 
  group_by(month) %>% 
  summarise(n_flights_cancelled = n())

# Find total_number_flights
total_number_flights <- 
  flights %>% 
  group_by(year,month) %>% 
  summarise(n_flights_total = n())

# Join tables
proportion_flights_cancelled <-
  inner_join(number_flights_cancelled,total_number_flights,
             by = c("month"))
proportion_flights_cancelled %>%
    mutate(proportion_flight_cancelled = 
                 ((n_flights_cancelled/n_flights_total))) %>% 
    round(2) %>% 
    arrange(desc(proportion_flight_cancelled)) %>% 
    rename(Month = "month",flights_cancelled = "n_flights_cancelled",
           Total_flights = "n_flights_total",
           Proportion_flights_cancelled = "proportion_flight_cancelled") %>%
    select(year,everything()) %>% 
    gt() %>% 
    fmt_percent(Proportion_flights_cancelled, decimals = 0)
year Month flights_cancelled Total_flights Proportion_flights_cancelled
2013 2 1261 24951 5%
2013 6 1009 28243 4%
2013 12 1025 28135 4%
2013 3 861 28834 3%
2013 7 940 29425 3%
2013 1 521 27004 2%
2013 4 668 28330 2%
2013 5 563 28796 2%
2013 8 486 29327 2%
2013 9 452 27574 2%
2013 10 236 28889 1%
2013 11 233 27268 1%
  • The month with the highest proportion of cancelled flights was February probably because of winter conditions and the month with the lowest was October.

(b) Challenge: Use the weather table. On how many days was there precipitation in the New York area for each month in 2013? What do you observe in combine with the results from part (a)? (Hint: the distinct function in dplyr can be useful.)

weather %>% 
  group_by(year,month,day) %>% 
  mutate(
        origin = as.factor(origin),
        year = as.factor(year),
        month = as.factor(month),
        day = as.factor(day),
        hour = as.factor(hour)) %>% 
    summarize(across(where(is.numeric), mean , na.rm = T)) %>% 
    filter(precip > 0) %>% 
    group_by(month) %>% 
    summarise(days_with_rain = n()) %>% 
    arrange(desc(days_with_rain)) %>% 
    rename(Month = "month",Days_with_rain = "days_with_rain") %>% 
    gt() 
Month Days_with_rain
6 16
1 14
2 14
7 13
12 13
4 12
5 12
8 11
11 11
10 9
3 8
9 8

(Excerpted from Problem 5.4 in textbook) Use the flights and plane tables. What is the oldest plane (specified by the tailnum variable) that flew from New York City airports in 2013

flights %>%
  select(-year) %>% 
      inner_join(planes, by = c('tailnum' = 'tailnum')) %>% 
      select(tailnum,year, everything()) %>% 
      group_by(tailnum,year) %>% 
      summarise(Mean_airtime = mean(air_time)) %>%
      arrange(year) %>% 
      drop_na() %>% 
      head(5)
# A tibble: 5 × 3
# Groups:   tailnum [5]
  tailnum  year Mean_airtime
  <chr>   <int>        <dbl>
1 N381AA   1956        280. 
2 N14629   1965        208. 
3 N615AA   1967        180. 
4 N364AA   1973        306. 
5 N675MC   1975         95.8
  • The oldest planes is N381AA

2. Use the mosaicData package to answer the following questions:

(a) (Excerpted from Problem 6.6 in textbook) The HELPfull data contains information about the Health Evaluation and Linkage to Primary Care (HELP) randomized trial in tall format. Create a table that each row displays the DRUGRISK and SEXRISK scores at the baseline and 6 months for a subject ID. (Hint: See the textbook for breakdown steps.)

# Load data
data("HELPfull")
HELPfull %>% 
    clean_names()  %>% 
    filter(id == 1) %>% 
    filter(time %in% c(0,6)) %>% 
    pivot_longer(cols = c(drugrisk,sexrisk), 
                 names_to = "type_of_risk", 
                 values_to = "risk_score") %>% 
    select(id,type_of_risk,risk_score,time,everything()) %>% 
    arrange(type_of_risk) %>% 
    select(1:9) %>% 
    gt()
id type_of_risk risk_score time num_intervals int_time1 days_since_bl int_time2 days_since_prev
1 drugrisk 0 0 1 0.000000 NA 6.000000 NA
1 drugrisk 0 6 1 8.033333 241 8.033333 241
1 sexrisk 4 0 1 0.000000 NA 6.000000 NA
1 sexrisk 1 6 1 8.033333 241 8.033333 241

(b) (Excerpted from Problem 7.3 in textbook) Use the purrr::map() function and the HELPrct data frame to fit a regression model predicting cesd as a function of age separately for each of the levels of the substance variable. Generate a list of results (estimates and confidence intervals) for the slope parameter for each level of the grouping variable. (Hint: The tidy() function with the option conf.int = T computes confidence intervals for a lm() object.)

table <- 
HELPfull %>%
    clean_names() %>% 
    select(ces_d,age,secd_sub) %>% 
    na.omit() %>% 
    mutate(secd_sub = as.factor(secd_sub)) %>%
    nest(data = -secd_sub) %>% 
    mutate(
      fit = map(data, ~ lm(ces_d ~ age, data = .x)),
      tidied = map(fit, tidy, conf.int =T)) %>% 
    unnest(tidied) %>% 
    select(-c(data,fit)) %>% 
    filter(term %in% "age") %>% 
    clean_names() %>% 
    arrange(secd_sub) %>% 
    drop_na() 

levels(table$secd_sub) <- c("None","Alcohol","Cocaine" ,
                            "Heroine","Barbituates", "Benzos", 
                             "Marijuana", "Methadone", "Opiates")
table %>% gt()
secd_sub term estimate std_error statistic p_value conf_low conf_high
None age 0.13582038 0.1126537 1.2056446 0.22934428 -0.08628813 0.35792889
Alcohol age -0.28330979 0.1687487 -1.6788858 0.09598731 -0.61769658 0.05107701
Cocaine age 0.06950926 0.1565577 0.4439850 0.65819510 -0.24182287 0.38084139
Heroine age -0.11506198 0.3496772 -0.3290520 0.74613547 -0.85281631 0.62269236
Benzos age 0.05769718 0.3339358 0.1727793 0.86771459 -0.73193555 0.84732992
Marijuana age -0.28415183 0.3220931 -0.8822041 0.38492071 -0.94290610 0.37460244
Opiates age -0.91494845 0.8883456 -1.0299465 0.49060905 -12.20245011 10.37255320

3. Health Care Coverage Data

(a) Read the dataset from CSVs hosted on GitHub using read_csv().(Hint: you want to skip the first two lines with the skip option and read up to the 52nd state with the n_max option.)

link <- "https://raw.githubusercontent.com/opencasestudies/ocs-healthexpenditure/master/data/KFF/healthcare-coverage.csv"

data_csv <-  
    read.csv(file = link, skip = 2, nrows = 52, 
             na.strings = "N/A")  %>% 
    clean_names() 
    

# Remove x from colnames
colnames(data_csv) <- sub("^x", "", colnames(data_csv))

(b) Convert all year-based columns to integer using mutate(across(…)) . (Hint: Read Section 7.2 in the textbook for the across() function.)

data_csv <- 
  data_csv %>%   
      mutate(across(starts_with("201"), as.integer))

(c) Further tidy the dataset and convert it to a long data format as shown below.

data_csv <- 
    data_csv %>%
        pivot_longer(-location, names_to = "year", 
                     values_to = "tot_coverage") %>% 
        separate(year,c("year","type"),extra = "merge") %>% 
        mutate(year = factor(year),
               type = factor(type),
               type = str_to_sentence(type))

data_csv$type <-  sub("_", "-", data_csv$type)

head(data_csv, 10) %>% 
  gt()
location year type tot_coverage
United States 2013 Employer 155696900
United States 2013 Non-group 13816000
United States 2013 Medicaid 54919100
United States 2013 Medicare 40876300
United States 2013 Other-public 6295400
United States 2013 Uninsured 41795100
United States 2013 Total 313401200
United States 2014 Employer 154347500
United States 2014 Non-group 19313000
United States 2014 Medicaid 61650400

4. Challenge

The Violations data set in the mdsr package contains information regarding the outcome of health inspections of restaurants in New York City. The ViolationCodes data set includes violation description and classification of critical violations (violations that are most likely to contribute to foodborne illness). See original data source for more information: NYC Open Data. Use these data to calculate, by zip code, the average number of inspections per restaurant and the rate of critical violations. What pattern do you see between the average number of inspections per restaurant and the rate of critical violations? (Hint: Note that an inspection can appear across several rows in the Violations data set if multipleviolations were associated with the inspection. The rate of critical violations should be defined as the rate of inspections that result in at least one critical violation.)

# Load data
data("ViolationCodes")
data("Violations")
  • Data Aggregation process
# Join ViolationCodes and Violations datasets

data_nyc_restaurants <- 
    inner_join(ViolationCodes,Violations, 
             by = c("violation_code" = "violation_code")) %>% 
    
    # Clean dataset delete columns
    select(-c(violation_description,cuisine_code,phone,action,
              street,grade_date,boro,record_date,
              grade,inspection_type,violation_code,building))  %>% 
    
    drop_na() %>% 
    mutate(across(where(is.character), as.factor),
           camis = factor(camis)) %>% 
    select(zipcode,camis,dba,inspection_date,everything()) %>% 
    arrange(zipcode,camis,inspection_date)  
data_per_rest <-
        data_nyc_restaurants %>% 
        drop_na() %>% 
        group_by(zipcode,camis,inspection_date) %>%
        
        #Create critical flags column
        mutate(n_critical_flags = 
                   ifelse(critical_flag == "Critical",TRUE,FALSE)) %>%
        filter(n_critical_flags > 0) %>% 
    
        summarise(number_of_critical_flags = 
                    sum(n_critical_flags == 'TRUE'),
                score = mean(score)) %>% 
      
        arrange(zipcode,camis)
data_number_visits_per_rest <- 
    data_per_rest %>%
        # How many times a restaurant was visit?
        group_by(zipcode,camis) %>%
        summarise(number_visits = n(),
                  mean_score = mean(score),
                  total_number_of_critical_flags = 
                      sum(number_of_critical_flags)) %>% 
  
        arrange(zipcode,camis) %>% 
        select(zipcode,camis,
               total_number_of_critical_flags,everything())
data_agregated_by_zipcode <-
    data_number_visits_per_rest %>% 
    #select(-camis) %>% 
    group_by(zipcode) %>% 
    summarise(mean_inspector_visits = mean(number_visits),
              mean_inspector_score = mean(mean_score),
              mean_critical_violations = 
                  mean(total_number_of_critical_flags) ) %>% 
    arrange(zipcode)  
  • Average number of inspections per restaurant by zip code and the rate of critical violations.
data_agregated_by_zipcode %>% 
    head(10)
# A tibble: 10 × 4
   zipcode mean_inspector_visits mean_inspector_score mean_critical_violations
     <int>                 <dbl>                <dbl>                    <dbl>
 1   10001                  5.43                 14.5                    10.3 
 2   10002                  5.59                 16.1                    11.5 
 3   10003                  5.77                 15.4                    11.5 
 4   10004                  5.29                 14.0                     9.44
 5   10005                  5.59                 15.2                    10.9 
 6   10006                  6.53                 15.5                    13.0 
 7   10007                  5.44                 14.9                    10.4 
 8   10009                  5.95                 15.7                    11.5 
 9   10010                  5.54                 15.7                    10.9 
10   10011                  5.71                 15.0                    11.1 

What pattern do you see between the average number of inspections per restaurant and the rate of critical violations? (Hint: Note that an inspection can appear across several rows in the Violations data set if multiple violations were associated with the inspection)

ggplot(data = data_agregated_by_zipcode ,
       aes(x = mean_inspector_visits, y = mean_critical_violations)) +
    geom_point() +
    geom_smooth(method = lm) +
    theme(axis.text.y  = element_text(size = 14),
                # Legend position and Axis size 

        axis.text.x   = element_text(size = 14),
        axis.title.y  = element_text(size = 14),
        axis.title.x  = element_text(size = 14),
        # Add borders to the plot
        panel.border = element_rect(colour = "black", 
                                    fill= NA,size = 1.3)) +
    ylab("Critical Violations") + xlab("Inspector Visits")

# Get lat long for each zipcode
zip_long_lat <- geocode_zip(data_agregated_by_zipcode$zipcode) %>% 
        mutate(zipcode = as.integer(zipcode))
# Join agregated data and lat long data
nyc <- 
    inner_join(data_agregated_by_zipcode,zip_long_lat, 
                  by = c("zipcode" = "zipcode")) %>%
    select(zipcode, lat,lng, everything()) %>% 
    drop_na()
# Get map and set palette
nyc_map <- get_map(getbb("New York"), maptype = "toner-background")
my_palette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
ggmap(nyc_map) +
  geom_point(data = nyc, aes(x = lng, y = lat, 
                             color = mean_critical_violations),
             size = 9) +
  scale_colour_gradientn(colours = my_palette(10), limits=c(1, 22)) +
  theme(legend.position = "bottom",
        legend.key.size = unit(2, 'cm'),
        legend.title = element_text(size=30),
        legend.text = element_text(size=30)) +
    labs(color = 'Critical \n Violations')